First, I import all the necessary packages installed with the pip tool. Than I set the constants that get used throughout the whole workbook. I also call necessary functions of the libraries to work the way I want, and create useful wrapper functions.
import geopandas as gpd
import pandas as pd
import seaborn as sns
import plotly.express as px
import plotly
import matplotlib.pylab as plt
DATA_DIR = "data"
FIG_DIR = "figures"
plotly.offline.init_notebook_mode()
def save_figure(figure, name):
figure.savefig(f"{FIG_DIR}/{name}.png", bbox_inches="tight")
The first is the gejson used by the map at the end. Than I load both dataframes. I add an extra column to the noise complaint frame for ease of joining with the other frame.
Please keep in mind, that San Francisco Government might change the url of the sources and they do not offer permalinks i could include here.
california = gpd.read_file(f"{DATA_DIR}/ca_geo.json")
raw_noise_df = pd.read_csv(f"{DATA_DIR}/Aircraft_Noise_Complaint_Data.csv")
raw_landing_df = pd.read_csv(f"{DATA_DIR}/Air_Traffic_Landings_Statistics.csv")
raw_noise_df["Period"] = raw_noise_df["Year"] * 100 + raw_noise_df["Month"]
In this section I rename columns to a shorter version. I also prefilter the landing data based on it's time period to match the other data frame's period. Than I show a simple line plot for the total complaints to see if there is a rather inactive period.
landing_df = raw_landing_df[["Activity Period", "Operating Airline IATA Code", "GEO Region", "Landing Aircraft Type", "Aircraft Body Type", "Aircraft Model", "Landing Count"]]\
.rename(columns={"Activity Period": "Period", "Operating Airline IATA Code": "Airline", "GEO Region": "From", "Landing Aircraft Type": "Type", "Aircraft Body Type": "Body", "Aircraft Model": "Model"})\
.query("Period <= 201907")
noise_df = raw_noise_df[["Period", "Community", "Total Complaints"]]
ax = sns.lineplot(noise_df, x="Period", y="Total Complaints")
save_figure(ax.get_figure(), "fig1")
As seen above, complaints are not really frequent before 2015. Let's have a closer look with a barplot, to find if april of 2015 seems like a good starting point.
plt.figure(figsize=(20, 3))
plt.xticks(rotation=90)
ax = sns.barplot(noise_df.groupby("Period", as_index=False).sum("Total Complaints").query("Period > 201504"), x="Period", y="Total Complaints")
save_figure(ax.get_figure(), "fig2")
As that seems like a good timeframe, I query both dataframes for this time period. After that, I create a barplot to see if the landing counts are any interesting on their own.
landing_df = landing_df.query("Period > 201504")
noise_df = noise_df.query("Period > 201504")
complaints_over_periods_df = noise_df.groupby("Period", as_index=False).sum("Total Complaints")
plt.figure(figsize=(20, 3))
plt.xticks(rotation=90)
ax = sns.barplot(landing_df[["Period", "Landing Count"]].groupby("Period", as_index=False).sum(), x="Period", y="Landing Count")
save_figure(ax.get_figure(), "fig3")
Turns out, that there is a slight repeating trend throughout the year, where february has the lowest, and august the highest traffic. The years itself seem pretty identical.
data = complaints_over_periods_df.sort_values(by=["Period"], ascending=False)
def period_totalcomplaints_horizontal_barplot(ax):
ax.barh([i for i in range(len(data["Period"]))], data["Total Complaints"], align="edge", color="green")
def pivot_with_complaints(pivot):
fig, (ax1, ax2) = plt.subplots(1, 2, width_ratios=[1.5, 1], sharey=True, gridspec_kw={"wspace":0})
fig.set_size_inches(20, 9)
sns.heatmap(pivot, ax=ax1, cbar_kws={"location":"left", "pad":0.18}, robust=True, linewidths=0.02, linecolor="black")
period_totalcomplaints_horizontal_barplot(ax2)
return fig
def landing_column_heatmap_with_complaints(col, filter_arr=None, period_start=None):
data = landing_df[["Period", col, "Landing Count"]]
if filter_arr is not None:
data = data.query(f"{col} in {filter_arr}")
if period_start is not None:
data = data.query(f"Period >= {period_start}")
return pivot_with_complaints(data.groupby(["Period", col], as_index=False).sum().pivot(index="Period", columns=col, values="Landing Count"))
airline_pivot = \
landing_df[["Period", "Airline", "Landing Count"]].groupby(["Period", "Airline"], as_index=False).sum()\
.pivot(index="Period", columns="Airline", values="Landing Count")
plt.figure(figsize=(20, 8))
plt.xticks(rotation=90)
ax = sns.heatmap(airline_pivot, vmax=700, cmap=sns.blend_palette(["#452103", "#F26419"], as_cmap=True))
save_figure(ax.get_figure(), "fig4")
fig = landing_column_heatmap_with_complaints("Airline", ["AA", "AC", "AS", "B6", "CP", "DL", "F9", "OO", "QK", "QX", "UA", "US", "VX", "WN"])
save_figure(fig, "fig5")
fig = landing_column_heatmap_with_complaints("From")
save_figure(fig, "fig6")
fig = landing_column_heatmap_with_complaints("Type")
save_figure(fig, "fig7")
fig = landing_column_heatmap_with_complaints("Body")
save_figure(fig, "fig8")
fig = landing_column_heatmap_with_complaints("Model", filter_arr=["A319", "A321", "A320", "B738", "B739", "B752", "B753", "B772", "B773", "B789", "CRJ2", "CRJ7", "E170", "E175"], period_start="201603")
save_figure(fig, "fig9")
e175_df = landing_df.query("Model == 'E175' & Period != 201712")[["Period", "Landing Count"]].groupby("Period", as_index=False).sum().merge(noise_df, on="Period")
e175_df["Period"] = e175_df["Period"].apply(lambda x: str(x))
active_communities = e175_df[["Period", "Community", "Total Complaints"]].groupby(by=["Period", "Community"], as_index=False).max().query("`Total Complaints` > 10000")["Community"].drop_duplicates()
fig, ax1 = plt.subplots()
fig.set_size_inches(20, 5)
#plt.figure(figsize=(20, 5))
plt.xticks(rotation=90)
ax1.set_ylabel("Total Complaints")
sns.lineplot(data=e175_df.query("Community in @active_communities").pivot(index="Period", columns="Community", values="Total Complaints"), ax=ax1)
plt.legend(bbox_to_anchor=(1.06, 1), loc='upper left', borderaxespad=0)
ax2 = ax1.twinx()
ax2.set_ylabel("Landing Count of E175")
sns.lineplot(data=e175_df[["Period", "Landing Count"]].drop_duplicates(), x="Period", y="Landing Count", ax=ax2, color="red", markers=True)
save_figure(fig, "fig10")
useful_cities_geof = california.query("name in @active_communities")
data = e175_df.query("Period > '201808'")
fig = px.choropleth_mapbox(data,
geojson=useful_cities_geof,
color="Total Complaints",
range_color=[0, data.get("Total Complaints").max()],
opacity=0.5,
animation_frame="Period",
locations="Community",
featureidkey="properties.name",
zoom=8.5,
center={"lat": 37.766801, "lon": -122.176573},
mapbox_style="open-street-map", width=900, height=900)
fig.write_html(f"{FIG_DIR}/map.html")
plotly.offline.iplot(fig)